import pandas as pd
import os

# Defaults for this one-off sampling script.
INPUT_FILE = "30G_processed_data_new/part-00008.parquet"
OUTPUT_FILE = "sampled_data_2.parquet"
SAMPLE_SIZE = 10000
RANDOM_STATE = 42


def sample_rows(
    df: pd.DataFrame,
    n: int = SAMPLE_SIZE,
    random_state: int = RANDOM_STATE,
) -> pd.DataFrame:
    """Return up to *n* randomly chosen rows of *df*.

    Sampling is done without replacement. ``DataFrame.sample`` raises
    ``ValueError`` when asked for more rows than the frame holds, so the
    request is clamped to ``len(df)``; a frame smaller than *n* is returned
    in full (in shuffled order) instead of aborting the script.

    Args:
        df: Frame to sample from.
        n: Maximum number of rows to return.
        random_state: Seed passed to ``DataFrame.sample`` so repeated runs
            pick the same rows.

    Returns:
        A new frame with ``min(n, len(df))`` rows.
    """
    return df.sample(n=min(n, len(df)), random_state=random_state)


def main(
    input_file: str = INPUT_FILE,
    output_file: str = OUTPUT_FILE,
    n: int = SAMPLE_SIZE,
    random_state: int = RANDOM_STATE,
) -> None:
    """Read a parquet file, sample rows from it, and write them to a new file.

    Args:
        input_file: Path of the parquet file to read.
        output_file: Path of the parquet file to write.
        n: Maximum number of rows to sample.
        random_state: Seed for the random sampling.
    """
    # Read the original parquet file.
    df = pd.read_parquet(input_file)

    # Randomly pick the rows (never more than the file actually contains).
    sampled_df = sample_rows(df, n, random_state)

    # Save the sample as a new parquet file, without the index.
    sampled_df.to_parquet(output_file, index=False)

    # Report the number of rows actually written, which may be below n.
    print(f"已从 {input_file} 中随机抽取{len(sampled_df)}条数据，并保存到 {output_file}")


if __name__ == "__main__":
    main()